In [325]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
In [326]:
pd.set_option('display.max_columns',None)
data=pd.read_csv("Life Expectancy Data.csv")
In [327]:
data.head()
Out[327]:
| Country | Year | Status | Life expectancy | Adult Mortality | infant deaths | Alcohol | percentage expenditure | Hepatitis B | Measles | BMI | under-five deaths | Polio | Total expenditure | Diphtheria | HIV/AIDS | GDP | Population | thinness 1-19 years | thinness 5-9 years | Income composition of resources | Schooling | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | 2015 | Developing | 65.0 | 263.0 | 62 | 0.01 | 71.279624 | 65.0 | 1154 | 19.1 | 83 | 6.0 | 8.16 | 65.0 | 0.1 | 584.259210 | 33736494.0 | 17.2 | 17.3 | 0.479 | 10.1 |
| 1 | Afghanistan | 2014 | Developing | 59.9 | 271.0 | 64 | 0.01 | 73.523582 | 62.0 | 492 | 18.6 | 86 | 58.0 | 8.18 | 62.0 | 0.1 | 612.696514 | 327582.0 | 17.5 | 17.5 | 0.476 | 10.0 |
| 2 | Afghanistan | 2013 | Developing | 59.9 | 268.0 | 66 | 0.01 | 73.219243 | 64.0 | 430 | 18.1 | 89 | 62.0 | 8.13 | 64.0 | 0.1 | 631.744976 | 31731688.0 | 17.7 | 17.7 | 0.470 | 9.9 |
| 3 | Afghanistan | 2012 | Developing | 59.5 | 272.0 | 69 | 0.01 | 78.184215 | 67.0 | 2787 | 17.6 | 93 | 67.0 | 8.52 | 67.0 | 0.1 | 669.959000 | 3696958.0 | 17.9 | 18.0 | 0.463 | 9.8 |
| 4 | Afghanistan | 2011 | Developing | 59.2 | 275.0 | 71 | 0.01 | 7.097109 | 68.0 | 3013 | 17.2 | 97 | 68.0 | 7.87 | 68.0 | 0.1 | 63.537231 | 2978599.0 | 18.2 | 18.2 | 0.454 | 9.5 |
In [328]:
data.shape
Out[328]:
(2938, 22)
In [329]:
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2938 entries, 0 to 2937 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Country 2938 non-null object 1 Year 2938 non-null int64 2 Status 2938 non-null object 3 Life expectancy 2928 non-null float64 4 Adult Mortality 2928 non-null float64 5 infant deaths 2938 non-null int64 6 Alcohol 2744 non-null float64 7 percentage expenditure 2938 non-null float64 8 Hepatitis B 2385 non-null float64 9 Measles 2938 non-null int64 10 BMI 2904 non-null float64 11 under-five deaths 2938 non-null int64 12 Polio 2919 non-null float64 13 Total expenditure 2712 non-null float64 14 Diphtheria 2919 non-null float64 15 HIV/AIDS 2938 non-null float64 16 GDP 2490 non-null float64 17 Population 2286 non-null float64 18 thinness 1-19 years 2904 non-null float64 19 thinness 5-9 years 2904 non-null float64 20 Income composition of resources 2771 non-null float64 21 Schooling 2775 non-null float64 dtypes: float64(16), int64(4), object(2) memory usage: 505.1+ KB
In [330]:
data1=data.copy()
In [331]:
data1.shape
Out[331]:
(2938, 22)
In [332]:
data1.isnull().sum()
Out[332]:
Country 0 Year 0 Status 0 Life expectancy 10 Adult Mortality 10 infant deaths 0 Alcohol 194 percentage expenditure 0 Hepatitis B 553 Measles 0 BMI 34 under-five deaths 0 Polio 19 Total expenditure 226 Diphtheria 19 HIV/AIDS 0 GDP 448 Population 652 thinness 1-19 years 34 thinness 5-9 years 34 Income composition of resources 167 Schooling 163 dtype: int64
In [333]:
data1.isnull().sum().sum()
Out[333]:
2563
In [334]:
data1['Country'].value_counts()
Out[334]:
Country
Afghanistan 16
Peru 16
Nicaragua 16
Niger 16
Nigeria 16
..
Niue 1
San Marino 1
Nauru 1
Saint Kitts and Nevis 1
Dominica 1
Name: count, Length: 193, dtype: int64
In [335]:
data1.describe().T
Out[335]:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Year | 2938.0 | 2.007519e+03 | 4.613841e+00 | 2000.00000 | 2004.000000 | 2.008000e+03 | 2.012000e+03 | 2.015000e+03 |
| Life expectancy | 2928.0 | 6.922493e+01 | 9.523867e+00 | 36.30000 | 63.100000 | 7.210000e+01 | 7.570000e+01 | 8.900000e+01 |
| Adult Mortality | 2928.0 | 1.647964e+02 | 1.242921e+02 | 1.00000 | 74.000000 | 1.440000e+02 | 2.280000e+02 | 7.230000e+02 |
| infant deaths | 2938.0 | 3.030395e+01 | 1.179265e+02 | 0.00000 | 0.000000 | 3.000000e+00 | 2.200000e+01 | 1.800000e+03 |
| Alcohol | 2744.0 | 4.602861e+00 | 4.052413e+00 | 0.01000 | 0.877500 | 3.755000e+00 | 7.702500e+00 | 1.787000e+01 |
| percentage expenditure | 2938.0 | 7.382513e+02 | 1.987915e+03 | 0.00000 | 4.685343 | 6.491291e+01 | 4.415341e+02 | 1.947991e+04 |
| Hepatitis B | 2385.0 | 8.094046e+01 | 2.507002e+01 | 1.00000 | 77.000000 | 9.200000e+01 | 9.700000e+01 | 9.900000e+01 |
| Measles | 2938.0 | 2.419592e+03 | 1.146727e+04 | 0.00000 | 0.000000 | 1.700000e+01 | 3.602500e+02 | 2.121830e+05 |
| BMI | 2904.0 | 3.832125e+01 | 2.004403e+01 | 1.00000 | 19.300000 | 4.350000e+01 | 5.620000e+01 | 8.730000e+01 |
| under-five deaths | 2938.0 | 4.203574e+01 | 1.604455e+02 | 0.00000 | 0.000000 | 4.000000e+00 | 2.800000e+01 | 2.500000e+03 |
| Polio | 2919.0 | 8.255019e+01 | 2.342805e+01 | 3.00000 | 78.000000 | 9.300000e+01 | 9.700000e+01 | 9.900000e+01 |
| Total expenditure | 2712.0 | 5.938190e+00 | 2.498320e+00 | 0.37000 | 4.260000 | 5.755000e+00 | 7.492500e+00 | 1.760000e+01 |
| Diphtheria | 2919.0 | 8.232408e+01 | 2.371691e+01 | 2.00000 | 78.000000 | 9.300000e+01 | 9.700000e+01 | 9.900000e+01 |
| HIV/AIDS | 2938.0 | 1.742103e+00 | 5.077785e+00 | 0.10000 | 0.100000 | 1.000000e-01 | 8.000000e-01 | 5.060000e+01 |
| GDP | 2490.0 | 7.483158e+03 | 1.427017e+04 | 1.68135 | 463.935626 | 1.766948e+03 | 5.910806e+03 | 1.191727e+05 |
| Population | 2286.0 | 1.275338e+07 | 6.101210e+07 | 34.00000 | 195793.250000 | 1.386542e+06 | 7.420359e+06 | 1.293859e+09 |
| thinness 1-19 years | 2904.0 | 4.839704e+00 | 4.420195e+00 | 0.10000 | 1.600000 | 3.300000e+00 | 7.200000e+00 | 2.770000e+01 |
| thinness 5-9 years | 2904.0 | 4.870317e+00 | 4.508882e+00 | 0.10000 | 1.500000 | 3.300000e+00 | 7.200000e+00 | 2.860000e+01 |
| Income composition of resources | 2771.0 | 6.275511e-01 | 2.109036e-01 | 0.00000 | 0.493000 | 6.770000e-01 | 7.790000e-01 | 9.480000e-01 |
| Schooling | 2775.0 | 1.199279e+01 | 3.358920e+00 | 0.00000 | 10.100000 | 1.230000e+01 | 1.430000e+01 | 2.070000e+01 |
In [336]:
data1.describe().sum()
Out[336]:
Year 1.498913e+04 Life expectancy 3.342949e+03 Adult Mortality 4.387089e+03 infant deaths 4.911230e+03 Alcohol 2.782870e+03 percentage expenditure 2.565521e+04 Hepatitis B 2.857010e+03 Measles 2.293851e+05 BMI 3.169665e+03 under-five deaths 5.672481e+03 Polio 3.394978e+03 Total expenditure 2.755914e+03 Diphtheria 3.394041e+03 HIV/AIDS 2.996520e+03 GDP 1.515594e+05 Population 1.376630e+09 thinness 1-19 years 2.953160e+03 thinness 5-9 years 2.954079e+03 Income composition of resources 2.774735e+03 Schooling 2.847752e+03 dtype: float64
In [337]:
data1.duplicated().sum()
Out[337]:
0
In [338]:
for i in data.select_dtypes(include='number').columns:
sns.histplot(data=data,x=i)
plt.show()
In [339]:
sns.pairplot(data)
Out[339]:
<seaborn.axisgrid.PairGrid at 0x1b7ca3b9d30>
In [340]:
for i in data.select_dtypes(include='number').columns:
sns.boxplot(data=data,x=i)
plt.show()
In [341]:
data['Country'].unique()
Out[341]:
array(['Afghanistan', 'Albania', 'Algeria', 'Angola',
'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan',
'Bolivia (Plurinational State of)', 'Bosnia and Herzegovina',
'Botswana', 'Brazil', 'Brunei Darussalam', 'Bulgaria',
'Burkina Faso', 'Burundi', "Côte d'Ivoire", 'Cabo Verde',
'Cambodia', 'Cameroon', 'Canada', 'Central African Republic',
'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo',
'Cook Islands', 'Costa Rica', 'Croatia', 'Cuba', 'Cyprus',
'Czechia', "Democratic People's Republic of Korea",
'Democratic Republic of the Congo', 'Denmark', 'Djibouti',
'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt',
'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia',
'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia',
'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala',
'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras',
'Hungary', 'Iceland', 'India', 'Indonesia',
'Iran (Islamic Republic of)', 'Iraq', 'Ireland', 'Israel', 'Italy',
'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Kiribati',
'Kuwait', 'Kyrgyzstan', "Lao People's Democratic Republic",
'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Lithuania',
'Luxembourg', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives',
'Mali', 'Malta', 'Marshall Islands', 'Mauritania', 'Mauritius',
'Mexico', 'Micronesia (Federated States of)', 'Monaco', 'Mongolia',
'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia',
'Nauru', 'Nepal', 'Netherlands', 'New Zealand', 'Nicaragua',
'Niger', 'Nigeria', 'Niue', 'Norway', 'Oman', 'Pakistan', 'Palau',
'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines',
'Poland', 'Portugal', 'Qatar', 'Republic of Korea',
'Republic of Moldova', 'Romania', 'Russian Federation', 'Rwanda',
'Saint Kitts and Nevis', 'Saint Lucia',
'Saint Vincent and the Grenadines', 'Samoa', 'San Marino',
'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia',
'Seychelles', 'Sierra Leone', 'Singapore', 'Slovakia', 'Slovenia',
'Solomon Islands', 'Somalia', 'South Africa', 'South Sudan',
'Spain', 'Sri Lanka', 'Sudan', 'Suriname', 'Swaziland', 'Sweden',
'Switzerland', 'Syrian Arab Republic', 'Tajikistan', 'Thailand',
'The former Yugoslav republic of Macedonia', 'Timor-Leste', 'Togo',
'Tonga', 'Trinidad and Tobago', 'Tunisia', 'Turkey',
'Turkmenistan', 'Tuvalu', 'Uganda', 'Ukraine',
'United Arab Emirates',
'United Kingdom of Great Britain and Northern Ireland',
'United Republic of Tanzania', 'United States of America',
'Uruguay', 'Uzbekistan', 'Vanuatu',
'Venezuela (Bolivarian Republic of)', 'Viet Nam', 'Yemen',
'Zambia', 'Zimbabwe'], dtype=object)
In [342]:
for i in [ 'Adult Mortality', 'Alcohol',
'percentage expenditure', 'Hepatitis B', ' BMI ', 'Polio',
'Total expenditure', 'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population',
' thinness 1-19 years', ' thinness 5-9 years',
'Income composition of resources', 'Schooling']:
data[i].fillna(data[i].mean(),inplace=True)
C:\Users\rithe\AppData\Local\Temp\ipykernel_13764\272504829.py:6: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
data[i].fillna(data[i].mean(),inplace=True)
In [343]:
data.isnull().sum()
Out[343]:
Country 0 Year 0 Status 0 Life expectancy 10 Adult Mortality 0 infant deaths 0 Alcohol 0 percentage expenditure 0 Hepatitis B 0 Measles 0 BMI 0 under-five deaths 0 Polio 0 Total expenditure 0 Diphtheria 0 HIV/AIDS 0 GDP 0 Population 0 thinness 1-19 years 0 thinness 5-9 years 0 Income composition of resources 0 Schooling 0 dtype: int64
In [344]:
for i in data.select_dtypes(include='number').columns:
sns.boxplot(data=data,x=i)
plt.show()
In [345]:
s1=data.select_dtypes(include='number').corr()
In [346]:
plt.figure(figsize=(10,12))
sns.heatmap(s1,annot=True)
Out[346]:
<Axes: >
In [347]:
plt.pie(satus1,labels=satus1.index,autopct='%1.1f%%',startangle=90)
plt.show()
remove the column spaces
In [348]:
data.columns = data.columns.str.strip()
In [349]:
data.columns
Out[349]:
Index(['Country', 'Year', 'Status', 'Life expectancy', 'Adult Mortality',
'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
'Measles', 'BMI', 'under-five deaths', 'Polio', 'Total expenditure',
'Diphtheria', 'HIV/AIDS', 'GDP', 'Population', 'thinness 1-19 years',
'thinness 5-9 years', 'Income composition of resources', 'Schooling'],
dtype='object')
In [350]:
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2938 entries, 0 to 2937 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Country 2938 non-null object 1 Year 2938 non-null int64 2 Status 2938 non-null object 3 Life expectancy 2928 non-null float64 4 Adult Mortality 2938 non-null float64 5 infant deaths 2938 non-null int64 6 Alcohol 2938 non-null float64 7 percentage expenditure 2938 non-null float64 8 Hepatitis B 2938 non-null float64 9 Measles 2938 non-null int64 10 BMI 2938 non-null float64 11 under-five deaths 2938 non-null int64 12 Polio 2938 non-null float64 13 Total expenditure 2938 non-null float64 14 Diphtheria 2938 non-null float64 15 HIV/AIDS 2938 non-null float64 16 GDP 2938 non-null float64 17 Population 2938 non-null float64 18 thinness 1-19 years 2938 non-null float64 19 thinness 5-9 years 2938 non-null float64 20 Income composition of resources 2938 non-null float64 21 Schooling 2938 non-null float64 dtypes: float64(16), int64(4), object(2) memory usage: 505.1+ KB
Now we are doing Label encoding for the Categorical Data
In [351]:
from sklearn.preprocessing import LabelEncoder
In [352]:
le = LabelEncoder()
data['Country_encoded'] = le.fit_transform(data['Country'])
In [353]:
data[['Country', 'Country_encoded']]
Out[353]:
| Country | Country_encoded | |
|---|---|---|
| 0 | Afghanistan | 0 |
| 1 | Afghanistan | 0 |
| 2 | Afghanistan | 0 |
| 3 | Afghanistan | 0 |
| 4 | Afghanistan | 0 |
| ... | ... | ... |
| 2933 | Zimbabwe | 192 |
| 2934 | Zimbabwe | 192 |
| 2935 | Zimbabwe | 192 |
| 2936 | Zimbabwe | 192 |
| 2937 | Zimbabwe | 192 |
2938 rows × 2 columns
In [354]:
le = LabelEncoder()
data['Status_encoded'] = le.fit_transform(data['Status'])
In [355]:
data[['Status', 'Status_encoded']]
Out[355]:
| Status | Status_encoded | |
|---|---|---|
| 0 | Developing | 1 |
| 1 | Developing | 1 |
| 2 | Developing | 1 |
| 3 | Developing | 1 |
| 4 | Developing | 1 |
| ... | ... | ... |
| 2933 | Developing | 1 |
| 2934 | Developing | 1 |
| 2935 | Developing | 1 |
| 2936 | Developing | 1 |
| 2937 | Developing | 1 |
2938 rows × 2 columns
In [356]:
data.head(20)
Out[356]:
| Country | Year | Status | Life expectancy | Adult Mortality | infant deaths | Alcohol | percentage expenditure | Hepatitis B | Measles | BMI | under-five deaths | Polio | Total expenditure | Diphtheria | HIV/AIDS | GDP | Population | thinness 1-19 years | thinness 5-9 years | Income composition of resources | Schooling | Country_encoded | Status_encoded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | 2015 | Developing | 65.0 | 263.0 | 62 | 0.01 | 71.279624 | 65.0 | 1154 | 19.1 | 83 | 6.0 | 8.16 | 65.0 | 0.1 | 584.259210 | 33736494.0 | 17.2 | 17.3 | 0.479 | 10.1 | 0 | 1 |
| 1 | Afghanistan | 2014 | Developing | 59.9 | 271.0 | 64 | 0.01 | 73.523582 | 62.0 | 492 | 18.6 | 86 | 58.0 | 8.18 | 62.0 | 0.1 | 612.696514 | 327582.0 | 17.5 | 17.5 | 0.476 | 10.0 | 0 | 1 |
| 2 | Afghanistan | 2013 | Developing | 59.9 | 268.0 | 66 | 0.01 | 73.219243 | 64.0 | 430 | 18.1 | 89 | 62.0 | 8.13 | 64.0 | 0.1 | 631.744976 | 31731688.0 | 17.7 | 17.7 | 0.470 | 9.9 | 0 | 1 |
| 3 | Afghanistan | 2012 | Developing | 59.5 | 272.0 | 69 | 0.01 | 78.184215 | 67.0 | 2787 | 17.6 | 93 | 67.0 | 8.52 | 67.0 | 0.1 | 669.959000 | 3696958.0 | 17.9 | 18.0 | 0.463 | 9.8 | 0 | 1 |
| 4 | Afghanistan | 2011 | Developing | 59.2 | 275.0 | 71 | 0.01 | 7.097109 | 68.0 | 3013 | 17.2 | 97 | 68.0 | 7.87 | 68.0 | 0.1 | 63.537231 | 2978599.0 | 18.2 | 18.2 | 0.454 | 9.5 | 0 | 1 |
| 5 | Afghanistan | 2010 | Developing | 58.8 | 279.0 | 74 | 0.01 | 79.679367 | 66.0 | 1989 | 16.7 | 102 | 66.0 | 9.20 | 66.0 | 0.1 | 553.328940 | 2883167.0 | 18.4 | 18.4 | 0.448 | 9.2 | 0 | 1 |
| 6 | Afghanistan | 2009 | Developing | 58.6 | 281.0 | 77 | 0.01 | 56.762217 | 63.0 | 2861 | 16.2 | 106 | 63.0 | 9.42 | 63.0 | 0.1 | 445.893298 | 284331.0 | 18.6 | 18.7 | 0.434 | 8.9 | 0 | 1 |
| 7 | Afghanistan | 2008 | Developing | 58.1 | 287.0 | 80 | 0.03 | 25.873925 | 64.0 | 1599 | 15.7 | 110 | 64.0 | 8.33 | 64.0 | 0.1 | 373.361116 | 2729431.0 | 18.8 | 18.9 | 0.433 | 8.7 | 0 | 1 |
| 8 | Afghanistan | 2007 | Developing | 57.5 | 295.0 | 82 | 0.02 | 10.910156 | 63.0 | 1141 | 15.2 | 113 | 63.0 | 6.73 | 63.0 | 0.1 | 369.835796 | 26616792.0 | 19.0 | 19.1 | 0.415 | 8.4 | 0 | 1 |
| 9 | Afghanistan | 2006 | Developing | 57.3 | 295.0 | 84 | 0.03 | 17.171518 | 64.0 | 1990 | 14.7 | 116 | 58.0 | 7.43 | 58.0 | 0.1 | 272.563770 | 2589345.0 | 19.2 | 19.3 | 0.405 | 8.1 | 0 | 1 |
| 10 | Afghanistan | 2005 | Developing | 57.3 | 291.0 | 85 | 0.02 | 1.388648 | 66.0 | 1296 | 14.2 | 118 | 58.0 | 8.70 | 58.0 | 0.1 | 25.294130 | 257798.0 | 19.3 | 19.5 | 0.396 | 7.9 | 0 | 1 |
| 11 | Afghanistan | 2004 | Developing | 57.0 | 293.0 | 87 | 0.02 | 15.296066 | 67.0 | 466 | 13.8 | 120 | 5.0 | 8.79 | 5.0 | 0.1 | 219.141353 | 24118979.0 | 19.5 | 19.7 | 0.381 | 6.8 | 0 | 1 |
| 12 | Afghanistan | 2003 | Developing | 56.7 | 295.0 | 87 | 0.01 | 11.089053 | 65.0 | 798 | 13.4 | 122 | 41.0 | 8.82 | 41.0 | 0.1 | 198.728544 | 2364851.0 | 19.7 | 19.9 | 0.373 | 6.5 | 0 | 1 |
| 13 | Afghanistan | 2002 | Developing | 56.2 | 3.0 | 88 | 0.01 | 16.887351 | 64.0 | 2486 | 13.0 | 122 | 36.0 | 7.76 | 36.0 | 0.1 | 187.845950 | 21979923.0 | 19.9 | 2.2 | 0.341 | 6.2 | 0 | 1 |
| 14 | Afghanistan | 2001 | Developing | 55.3 | 316.0 | 88 | 0.01 | 10.574728 | 63.0 | 8762 | 12.6 | 122 | 35.0 | 7.80 | 33.0 | 0.1 | 117.496980 | 2966463.0 | 2.1 | 2.4 | 0.340 | 5.9 | 0 | 1 |
| 15 | Afghanistan | 2000 | Developing | 54.8 | 321.0 | 88 | 0.01 | 10.424960 | 62.0 | 6532 | 12.2 | 122 | 24.0 | 8.20 | 24.0 | 0.1 | 114.560000 | 293756.0 | 2.3 | 2.5 | 0.338 | 5.5 | 0 | 1 |
| 16 | Albania | 2015 | Developing | 77.8 | 74.0 | 0 | 4.60 | 364.975229 | 99.0 | 0 | 58.0 | 0 | 99.0 | 6.00 | 99.0 | 0.1 | 3954.227830 | 28873.0 | 1.2 | 1.3 | 0.762 | 14.2 | 1 | 1 |
| 17 | Albania | 2014 | Developing | 77.5 | 8.0 | 0 | 4.51 | 428.749067 | 98.0 | 0 | 57.2 | 1 | 98.0 | 5.88 | 98.0 | 0.1 | 4575.763787 | 288914.0 | 1.2 | 1.3 | 0.761 | 14.2 | 1 | 1 |
| 18 | Albania | 2013 | Developing | 77.2 | 84.0 | 0 | 4.76 | 430.876979 | 99.0 | 0 | 56.5 | 1 | 99.0 | 5.66 | 99.0 | 0.1 | 4414.723140 | 289592.0 | 1.3 | 1.4 | 0.759 | 14.2 | 1 | 1 |
| 19 | Albania | 2012 | Developing | 76.9 | 86.0 | 0 | 5.14 | 412.443356 | 99.0 | 9 | 55.8 | 1 | 99.0 | 5.59 | 99.0 | 0.1 | 4247.614380 | 2941.0 | 1.3 | 1.4 | 0.752 | 14.2 | 1 | 1 |
Now we will Drop the columns which are not required
In [357]:
data3=data.drop(['Adult Mortality','Status','infant deaths','percentage expenditure','Measles','under-five deaths','GDP','Country','Income composition of resources','thinness 5-9 years','Schooling'] ,axis=1)
In [358]:
data3.head(20)
Out[358]:
| Year | Life expectancy | Alcohol | Hepatitis B | BMI | Polio | Total expenditure | Diphtheria | HIV/AIDS | Population | thinness 1-19 years | Country_encoded | Status_encoded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015 | 65.0 | 0.01 | 65.0 | 19.1 | 6.0 | 8.16 | 65.0 | 0.1 | 33736494.0 | 17.2 | 0 | 1 |
| 1 | 2014 | 59.9 | 0.01 | 62.0 | 18.6 | 58.0 | 8.18 | 62.0 | 0.1 | 327582.0 | 17.5 | 0 | 1 |
| 2 | 2013 | 59.9 | 0.01 | 64.0 | 18.1 | 62.0 | 8.13 | 64.0 | 0.1 | 31731688.0 | 17.7 | 0 | 1 |
| 3 | 2012 | 59.5 | 0.01 | 67.0 | 17.6 | 67.0 | 8.52 | 67.0 | 0.1 | 3696958.0 | 17.9 | 0 | 1 |
| 4 | 2011 | 59.2 | 0.01 | 68.0 | 17.2 | 68.0 | 7.87 | 68.0 | 0.1 | 2978599.0 | 18.2 | 0 | 1 |
| 5 | 2010 | 58.8 | 0.01 | 66.0 | 16.7 | 66.0 | 9.20 | 66.0 | 0.1 | 2883167.0 | 18.4 | 0 | 1 |
| 6 | 2009 | 58.6 | 0.01 | 63.0 | 16.2 | 63.0 | 9.42 | 63.0 | 0.1 | 284331.0 | 18.6 | 0 | 1 |
| 7 | 2008 | 58.1 | 0.03 | 64.0 | 15.7 | 64.0 | 8.33 | 64.0 | 0.1 | 2729431.0 | 18.8 | 0 | 1 |
| 8 | 2007 | 57.5 | 0.02 | 63.0 | 15.2 | 63.0 | 6.73 | 63.0 | 0.1 | 26616792.0 | 19.0 | 0 | 1 |
| 9 | 2006 | 57.3 | 0.03 | 64.0 | 14.7 | 58.0 | 7.43 | 58.0 | 0.1 | 2589345.0 | 19.2 | 0 | 1 |
| 10 | 2005 | 57.3 | 0.02 | 66.0 | 14.2 | 58.0 | 8.70 | 58.0 | 0.1 | 257798.0 | 19.3 | 0 | 1 |
| 11 | 2004 | 57.0 | 0.02 | 67.0 | 13.8 | 5.0 | 8.79 | 5.0 | 0.1 | 24118979.0 | 19.5 | 0 | 1 |
| 12 | 2003 | 56.7 | 0.01 | 65.0 | 13.4 | 41.0 | 8.82 | 41.0 | 0.1 | 2364851.0 | 19.7 | 0 | 1 |
| 13 | 2002 | 56.2 | 0.01 | 64.0 | 13.0 | 36.0 | 7.76 | 36.0 | 0.1 | 21979923.0 | 19.9 | 0 | 1 |
| 14 | 2001 | 55.3 | 0.01 | 63.0 | 12.6 | 35.0 | 7.80 | 33.0 | 0.1 | 2966463.0 | 2.1 | 0 | 1 |
| 15 | 2000 | 54.8 | 0.01 | 62.0 | 12.2 | 24.0 | 8.20 | 24.0 | 0.1 | 293756.0 | 2.3 | 0 | 1 |
| 16 | 2015 | 77.8 | 4.60 | 99.0 | 58.0 | 99.0 | 6.00 | 99.0 | 0.1 | 28873.0 | 1.2 | 1 | 1 |
| 17 | 2014 | 77.5 | 4.51 | 98.0 | 57.2 | 98.0 | 5.88 | 98.0 | 0.1 | 288914.0 | 1.2 | 1 | 1 |
| 18 | 2013 | 77.2 | 4.76 | 99.0 | 56.5 | 99.0 | 5.66 | 99.0 | 0.1 | 289592.0 | 1.3 | 1 | 1 |
| 19 | 2012 | 76.9 | 5.14 | 99.0 | 55.8 | 99.0 | 5.59 | 99.0 | 0.1 | 2941.0 | 1.3 | 1 | 1 |
In [359]:
data3.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2938 entries, 0 to 2937 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Year 2938 non-null int64 1 Life expectancy 2928 non-null float64 2 Alcohol 2938 non-null float64 3 Hepatitis B 2938 non-null float64 4 BMI 2938 non-null float64 5 Polio 2938 non-null float64 6 Total expenditure 2938 non-null float64 7 Diphtheria 2938 non-null float64 8 HIV/AIDS 2938 non-null float64 9 Population 2938 non-null float64 10 thinness 1-19 years 2938 non-null float64 11 Country_encoded 2938 non-null int32 12 Status_encoded 2938 non-null int32 dtypes: float64(10), int32(2), int64(1) memory usage: 275.6 KB
Converting the Outliers to upper quartile or lower quartile range
In [360]:
numeric_cols=data3.select_dtypes(include='number').columns
In [361]:
for col in numeric_cols:
Q1 = data3[col].quantile(0.25)
Q3 = data3[col].quantile(0.75)
IQR = Q3 - Q1
lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR
data3[col] = data3[col].clip(lower=lower_limit, upper=upper_limit)
In [362]:
data3.head()
Out[362]:
| Year | Life expectancy | Alcohol | Hepatitis B | BMI | Polio | Total expenditure | Diphtheria | HIV/AIDS | Population | thinness 1-19 years | Country_encoded | Status_encoded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015 | 65.0 | 0.01 | 65.0 | 19.1 | 49.5 | 8.16 | 65.0 | 0.1 | 3.125506e+07 | 15.35 | 0 | 1 |
| 1 | 2014 | 59.9 | 0.01 | 62.0 | 18.6 | 58.0 | 8.18 | 62.0 | 0.1 | 3.275820e+05 | 15.35 | 0 | 1 |
| 2 | 2013 | 59.9 | 0.01 | 64.0 | 18.1 | 62.0 | 8.13 | 64.0 | 0.1 | 3.125506e+07 | 15.35 | 0 | 1 |
| 3 | 2012 | 59.5 | 0.01 | 67.0 | 17.6 | 67.0 | 8.52 | 67.0 | 0.1 | 3.696958e+06 | 15.35 | 0 | 1 |
| 4 | 2011 | 59.2 | 0.01 | 68.0 | 17.2 | 68.0 | 7.87 | 68.0 | 0.1 | 2.978599e+06 | 15.35 | 0 | 1 |
In [363]:
df=data3.copy()
In [364]:
df.head(20)
Out[364]:
| Year | Life expectancy | Alcohol | Hepatitis B | BMI | Polio | Total expenditure | Diphtheria | HIV/AIDS | Population | thinness 1-19 years | Country_encoded | Status_encoded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015 | 65.0 | 0.01 | 65.0 | 19.1 | 49.5 | 8.16 | 65.0 | 0.1 | 3.125506e+07 | 15.35 | 0 | 1 |
| 1 | 2014 | 59.9 | 0.01 | 62.0 | 18.6 | 58.0 | 8.18 | 62.0 | 0.1 | 3.275820e+05 | 15.35 | 0 | 1 |
| 2 | 2013 | 59.9 | 0.01 | 64.0 | 18.1 | 62.0 | 8.13 | 64.0 | 0.1 | 3.125506e+07 | 15.35 | 0 | 1 |
| 3 | 2012 | 59.5 | 0.01 | 67.0 | 17.6 | 67.0 | 8.52 | 67.0 | 0.1 | 3.696958e+06 | 15.35 | 0 | 1 |
| 4 | 2011 | 59.2 | 0.01 | 68.0 | 17.2 | 68.0 | 7.87 | 68.0 | 0.1 | 2.978599e+06 | 15.35 | 0 | 1 |
| 5 | 2010 | 58.8 | 0.01 | 66.0 | 16.7 | 66.0 | 9.20 | 66.0 | 0.1 | 2.883167e+06 | 15.35 | 0 | 1 |
| 6 | 2009 | 58.6 | 0.01 | 63.0 | 16.2 | 63.0 | 9.42 | 63.0 | 0.1 | 2.843310e+05 | 15.35 | 0 | 1 |
| 7 | 2008 | 58.1 | 0.03 | 64.0 | 15.7 | 64.0 | 8.33 | 64.0 | 0.1 | 2.729431e+06 | 15.35 | 0 | 1 |
| 8 | 2007 | 57.5 | 0.02 | 63.0 | 15.2 | 63.0 | 6.73 | 63.0 | 0.1 | 2.661679e+07 | 15.35 | 0 | 1 |
| 9 | 2006 | 57.3 | 0.03 | 64.0 | 14.7 | 58.0 | 7.43 | 58.0 | 0.1 | 2.589345e+06 | 15.35 | 0 | 1 |
| 10 | 2005 | 57.3 | 0.02 | 66.0 | 14.2 | 58.0 | 8.70 | 58.0 | 0.1 | 2.577980e+05 | 15.35 | 0 | 1 |
| 11 | 2004 | 57.0 | 0.02 | 67.0 | 13.8 | 49.5 | 8.79 | 49.5 | 0.1 | 2.411898e+07 | 15.35 | 0 | 1 |
| 12 | 2003 | 56.7 | 0.01 | 65.0 | 13.4 | 49.5 | 8.82 | 49.5 | 0.1 | 2.364851e+06 | 15.35 | 0 | 1 |
| 13 | 2002 | 56.2 | 0.01 | 64.0 | 13.0 | 49.5 | 7.76 | 49.5 | 0.1 | 2.197992e+07 | 15.35 | 0 | 1 |
| 14 | 2001 | 55.3 | 0.01 | 63.0 | 12.6 | 49.5 | 7.80 | 49.5 | 0.1 | 2.966463e+06 | 2.10 | 0 | 1 |
| 15 | 2000 | 54.8 | 0.01 | 62.0 | 12.2 | 49.5 | 8.20 | 49.5 | 0.1 | 2.937560e+05 | 2.30 | 0 | 1 |
| 16 | 2015 | 77.8 | 4.60 | 99.0 | 58.0 | 99.0 | 6.00 | 99.0 | 0.1 | 2.887300e+04 | 1.20 | 1 | 1 |
| 17 | 2014 | 77.5 | 4.51 | 98.0 | 57.2 | 98.0 | 5.88 | 98.0 | 0.1 | 2.889140e+05 | 1.20 | 1 | 1 |
| 18 | 2013 | 77.2 | 4.76 | 99.0 | 56.5 | 99.0 | 5.66 | 99.0 | 0.1 | 2.895920e+05 | 1.30 | 1 | 1 |
| 19 | 2012 | 76.9 | 5.14 | 99.0 | 55.8 | 99.0 | 5.59 | 99.0 | 0.1 | 2.941000e+03 | 1.30 | 1 | 1 |
In [365]:
df1=data3.drop('Life expectancy',axis=1)
In [366]:
df1.head()
Out[366]:
| Year | Alcohol | Hepatitis B | BMI | Polio | Total expenditure | Diphtheria | HIV/AIDS | Population | thinness 1-19 years | Country_encoded | Status_encoded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015 | 0.01 | 65.0 | 19.1 | 49.5 | 8.16 | 65.0 | 0.1 | 3.125506e+07 | 15.35 | 0 | 1 |
| 1 | 2014 | 0.01 | 62.0 | 18.6 | 58.0 | 8.18 | 62.0 | 0.1 | 3.275820e+05 | 15.35 | 0 | 1 |
| 2 | 2013 | 0.01 | 64.0 | 18.1 | 62.0 | 8.13 | 64.0 | 0.1 | 3.125506e+07 | 15.35 | 0 | 1 |
| 3 | 2012 | 0.01 | 67.0 | 17.6 | 67.0 | 8.52 | 67.0 | 0.1 | 3.696958e+06 | 15.35 | 0 | 1 |
| 4 | 2011 | 0.01 | 68.0 | 17.2 | 68.0 | 7.87 | 68.0 | 0.1 | 2.978599e+06 | 15.35 | 0 | 1 |
In [367]:
for i in df1.select_dtypes(include='number').columns:
sns.boxplot(data=df1,x=i)
plt.show()
In [368]:
for i in df1.select_dtypes(include='number').columns:
sns.histplot(data=df1,x=i)
plt.show()
Standasation
In [369]:
from sklearn.preprocessing import StandardScaler
In [370]:
scaler = StandardScaler()
standardized_data = scaler.fit_transform(data3)
standardized_data
Out[370]:
array([[ 1.6217623 , -0.44567214, -1.17336066, ..., 2.70888393,
-1.69104231, 0. ],
[ 1.40498625, -0.98260157, -1.17336066, ..., 2.70888393,
-1.69104231, 0. ],
[ 1.1882102 , -0.98260157, -1.17336066, ..., 2.70888393,
-1.69104231, 0. ],
...,
[-1.19632639, -2.5723338 , -0.04402256, ..., -0.89203308,
1.7231814 , 0. ],
[-1.41310244, -2.51969366, -0.73644479, ..., -0.79024037,
1.7231814 , 0. ],
[-1.62987849, -2.44599746, -0.74666504, ..., 1.60188824,
1.7231814 , 0. ]])
In [371]:
scaler = StandardScaler()
standardized_array = scaler.fit_transform(data3)
standardized_data = pd.DataFrame(standardized_array, columns=data3.columns, index=data3.index)
standardized_data
Out[371]:
| Year | Life expectancy | Alcohol | Hepatitis B | BMI | Polio | Total expenditure | Diphtheria | HIV/AIDS | Population | thinness 1-19 years | Country_encoded | Status_encoded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.621762 | -0.445672 | -1.173361 | -1.534064 | -0.964715 | -2.265630 | 0.985324 | -1.282382 | -0.629209 | 2.650045 | 2.708884 | -1.691042 | 0.0 |
| 1 | 1.404986 | -0.982602 | -1.173361 | -1.768413 | -0.989810 | -1.727535 | 0.994061 | -1.472525 | -0.629209 | -0.826718 | 2.708884 | -1.691042 | 0.0 |
| 2 | 1.188210 | -0.982602 | -1.173361 | -1.612181 | -1.014905 | -1.474314 | 0.972218 | -1.345763 | -0.629209 | 2.650045 | 2.708884 | -1.691042 | 0.0 |
| 3 | 0.971434 | -1.024714 | -1.173361 | -1.377832 | -1.040000 | -1.157788 | 1.142595 | -1.155619 | -0.629209 | -0.447944 | 2.708884 | -1.691042 | 0.0 |
| 4 | 0.754658 | -1.056298 | -1.173361 | -1.299715 | -1.060076 | -1.094482 | 0.858634 | -1.092238 | -0.629209 | -0.528699 | 2.708884 | -1.691042 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2933 | -0.762774 | -2.624974 | -0.061908 | -1.299715 | -0.563194 | -1.157788 | 0.535355 | -1.282382 | 1.895041 | 0.572861 | 1.194717 | 1.723181 | 0.0 |
| 2934 | -0.979550 | -2.603918 | -0.138560 | -2.053448 | -0.583270 | -2.265630 | 0.268868 | -1.092238 | 1.895041 | 0.556717 | 1.296510 | 1.723181 | 0.0 |
| 2935 | -1.196326 | -2.572334 | -0.044023 | -0.909134 | -0.603346 | -0.777956 | 0.273237 | -0.902095 | 1.895041 | -0.849432 | -0.892033 | 1.723181 | 0.0 |
| 2936 | -1.413102 | -2.519694 | -0.736445 | -0.674785 | -0.623422 | -0.588040 | 0.111598 | -0.648570 | 1.895041 | 0.526619 | -0.790240 | 1.723181 | 0.0 |
| 2937 | -1.629878 | -2.445997 | -0.746665 | -0.440436 | -0.643498 | -0.461429 | 0.522249 | -0.458427 | 1.895041 | 0.510441 | 1.601888 | 1.723181 | 0.0 |
2938 rows × 13 columns
In [372]:
for i in standardized_data.select_dtypes(include='number').columns:
sns.histplot(data=standardized_data,x=i)
plt.show()
In [373]:
for i in standardized_data.select_dtypes(include='number').columns:
sns.boxplot(data=standardized_data,x=i)
plt.show()
Normalization
In [374]:
from sklearn.preprocessing import MinMaxScaler
In [375]:
scaler =MinMaxScaler()
normalized_data = scaler.fit_transform(data3)
normalized_data
Out[375]:
array([[1. , 0.46428571, 0. , ..., 1. , 0. ,
0. ],
[0.93333333, 0.35044643, 0. , ..., 1. , 0. ,
0. ],
[0.86666667, 0.35044643, 0. , ..., 1. , 0. ,
0. ],
...,
[0.13333333, 0.01339286, 0.26268479, ..., 0.07213115, 1. ,
0. ],
[0.06666667, 0.02455357, 0.10162692, ..., 0.09836066, 1. ,
0. ],
[0. , 0.04017857, 0.09924968, ..., 0.7147541 , 1. ,
0. ]])
In [376]:
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(data3)
normalized_data = pd.DataFrame(normalized_data, columns=data3.columns, index=data3.index)
normalized_data
Out[376]:
| Year | Life expectancy | Alcohol | Hepatitis B | BMI | Polio | Total expenditure | Diphtheria | HIV/AIDS | Population | thinness 1-19 years | Country_encoded | Status_encoded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.000000 | 0.464286 | 0.000000 | 0.163568 | 0.209733 | 0.000000 | 0.683333 | 0.313131 | 0.0 | 1.000000 | 1.000000 | 0.0 | 0.0 |
| 1 | 0.933333 | 0.350446 | 0.000000 | 0.089765 | 0.203940 | 0.171717 | 0.685088 | 0.252525 | 0.0 | 0.010480 | 1.000000 | 0.0 | 0.0 |
| 2 | 0.866667 | 0.350446 | 0.000000 | 0.138967 | 0.198146 | 0.252525 | 0.680702 | 0.292929 | 0.0 | 1.000000 | 1.000000 | 0.0 | 0.0 |
| 3 | 0.800000 | 0.341518 | 0.000000 | 0.212770 | 0.192352 | 0.353535 | 0.714912 | 0.353535 | 0.0 | 0.118283 | 1.000000 | 0.0 | 0.0 |
| 4 | 0.733333 | 0.334821 | 0.000000 | 0.237371 | 0.187717 | 0.373737 | 0.657895 | 0.373737 | 0.0 | 0.095299 | 1.000000 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2933 | 0.266667 | 0.002232 | 0.258525 | 0.237371 | 0.302433 | 0.353535 | 0.592982 | 0.313131 | 1.0 | 0.408813 | 0.609836 | 1.0 | 0.0 |
| 2934 | 0.200000 | 0.006696 | 0.240695 | 0.000000 | 0.297798 | 0.000000 | 0.539474 | 0.373737 | 1.0 | 0.404219 | 0.636066 | 1.0 | 0.0 |
| 2935 | 0.133333 | 0.013393 | 0.262685 | 0.360375 | 0.293163 | 0.474747 | 0.540351 | 0.434343 | 1.0 | 0.004015 | 0.072131 | 1.0 | 0.0 |
| 2936 | 0.066667 | 0.024554 | 0.101627 | 0.434178 | 0.288528 | 0.535354 | 0.507895 | 0.515152 | 1.0 | 0.395653 | 0.098361 | 1.0 | 0.0 |
| 2937 | 0.000000 | 0.040179 | 0.099250 | 0.507981 | 0.283893 | 0.575758 | 0.590351 | 0.575758 | 1.0 | 0.391048 | 0.714754 | 1.0 | 0.0 |
2938 rows × 13 columns
In [377]:
for i in normalized_data.select_dtypes(include='number').columns:
sns.boxplot(data=normalized_data,x=i)
plt.show()
In [378]:
for i in normalized_data.select_dtypes(include='number').columns:
sns.histplot(data=normalized_data,x=i ,kde=True)
plt.show()
In [379]:
c1=df1.select_dtypes(include='number').corr()
In [380]:
plt.figure(figsize=(10,15))
sns.heatmap(c1,annot=True)
Out[380]:
<Axes: >
In [381]:
df1=df1.drop(['Population','HIV/AIDS'],axis=1)
In [382]:
sns.pairplot(df1)
Out[382]:
<seaborn.axisgrid.PairGrid at 0x1b7e028c230>
In [383]:
sns.pairplot(df1,diag_kind='kde')
Out[383]:
<seaborn.axisgrid.PairGrid at 0x1b7e3cb1a60>
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: